In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
In [2]:
large_train = "./424_F2024_Final_PC_large_train_v1.csv"
test_path = "./424_F2024_Final_PC_test_without_response_v1.csv"
model_path = "my_model_500epoch.h5"

df_train = pd.read_csv(large_train)
df_test = pd.read_csv(test_path)
In [3]:
df_train.sample(5)
Out[3]:
rating year_review firm job_title headline pros cons
283563 3 2018 Oracle Large company large company with impressive market share large company more focused on margin than profit
408546 5 2016 PwC Senior Associate They have great benefits and really help grow ... Great career growth, travel opportunities, and... Big company and a lot of red tape to get throu...
369588 4 2021 Goldman-Sachs Strategist Good company to start your career Good Perks, Culcure for junior to grow, Good M... Benefit is not comparable to other banks
253300 4 2021 McDonald-s Mcdonalds Fry Cook It was ok for a first job decent hours and good management low opportunity to move up
132776 2 2020 Deloitte Senior Consultant Not a good company Nothing much to say about this company Not a good place to work
In [4]:
df_test.sample(5)
Out[4]:
rating year_review firm job_title headline pros cons
24179 NaN 2017 J-P-Morgan Software Engineer Unjust Dismissal None that I can think of in the department I w... I was dismissed on totally fabricated charges....
25271 NaN 2017 Citi Anonymous Employee Former Corporate Banking Executive at Citigroup A never say die attitude which helps push the ... Large organization and has associated red tape...
76017 NaN 2017 IBM Business Analyst Good work culture - Good environment to work in\r- Lots of flexi... - Management not good.\r- Not hikes , appraisa...
54714 NaN 2021 EY Advisory Senior Consultant Great for Professional Advancement Extensive resources and knowledge at your disp... Very little consistency with upper management....
41838 NaN 2017 British-Airways PMO Manager Actively seeking employment elsewhere People that are enthusiastic about the commerc... Run by accountants that know better than the p...
In [5]:
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return 'Unknown'
    text = text.encode('ascii', 'ignore').decode()  # drop non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # collapse repeated whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove punctuation and special characters
    text = text.lower().strip()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)
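# Illustrative example (hypothetical input, not from the dataset):
#   clean_text("The Benefits were GREAT!!!")  ->  "benefit great"
# ("the"/"were" are dropped as stopwords, punctuation is removed, "benefits" is lemmatized to "benefit")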

# Preprocess the training and test data
def preprocess_data(df_train, df_test):
    # Clean text columns
    for col in ['pros', 'cons', 'headline']:
        df_train[col] = df_train[col].apply(clean_text)
        df_test[col] = df_test[col].apply(clean_text)

    # Combine text columns for vectorization
    combined_text_train = df_train['pros'] + " " + df_train['cons'] + " " + df_train['headline']
    combined_text_test = df_test['pros'] + " " + df_test['cons'] + " " + df_test['headline']

    # Initialize the TF-IDF vectorizer with an increased max_features
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjusted to capture more information

    # Fit and transform the training data, transform the test data
    combined_tfidf_train = tfidf_vectorizer.fit_transform(combined_text_train).toarray()
    combined_tfidf_test = tfidf_vectorizer.transform(combined_text_test).toarray()

    # Handle missing or empty values in 'job_title' and 'firm'
    df_train['job_title'] = df_train['job_title'].fillna('Unknown')
    df_test['job_title'] = df_test['job_title'].fillna('Unknown')
    df_train['firm'] = df_train['firm'].fillna('Unknown')
    df_test['firm'] = df_test['firm'].fillna('Unknown')

    # Encode 'firm' using LabelEncoder
    label_encoder_firm = LabelEncoder()
    all_firm_values = pd.concat([df_train['firm'], df_test['firm']])
    label_encoder_firm.fit(all_firm_values)
    df_train['firm_encoded'] = label_encoder_firm.transform(df_train['firm'])
    df_test['firm_encoded'] = label_encoder_firm.transform(df_test['firm'])

    # Convert 'rating' to a numeric target for regression
    # (the test file ships without responses, so y_test is only a NaN placeholder here)
    y_train = df_train['rating'].values
    y_test = df_test['rating'].values

    # Combine all the features for both training and test sets
    X_train = np.hstack([
        combined_tfidf_train,
        df_train[['firm_encoded']].values
    ]).astype('float32')

    X_test = np.hstack([
        combined_tfidf_test,
        df_test[['firm_encoded']].values
    ]).astype('float32')

    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Return processed features and labels
    return X_train, X_test, y_train, y_test

# Preprocess the data
X_train, X_test, y_train, y_test = preprocess_data(df_train, df_test)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mayank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mayank/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
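One gap worth flagging: the TF-IDF vectorizer, firm label encoder, and scaler are all fitted inside preprocess_data but never saved, so the HDF5 model alone cannot score new reviews later. A minimal sketch of how the fitted transformers could be persisted alongside the model (assuming joblib is installed; the helper and file names are illustrative, not part of the original pipeline):

import joblib

def save_transformers(vectorizer, firm_encoder, scaler, prefix='preproc'):
    # Persist the fitted transformers so new text can be vectorized, encoded,
    # and scaled exactly as the training data was.
    joblib.dump(vectorizer, f'{prefix}_tfidf.joblib')
    joblib.dump(firm_encoder, f'{prefix}_firm_encoder.joblib')
    joblib.dump(scaler, f'{prefix}_scaler.joblib')

# e.g. call inside preprocess_data just before the return statement:
# save_transformers(tfidf_vectorizer, label_encoder_firm, scaler)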
In [6]:
X_train.shape, X_test.shape
Out[6]:
((500000, 1001), (100000, 1001))
In [7]:
X_train[:5]
Out[7]:
array([[-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
        -0.05917969, -1.2568026 ],
       [-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
        14.924771  , -0.19707541],
       [-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
        -0.05917969, -1.2666148 ],
       [-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
        -0.05917969, -1.7964784 ],
       [-0.08589483, -0.05596118, -0.05692343, ..., -0.13509741,
        -0.05917969, -0.4325703 ]], dtype=float32)
In [8]:
# Build the model with adjusted architecture
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], kernel_regularizer=l2(0.001)),  # Added L2 regularization
    BatchNormalization(),
    LeakyReLU(alpha=0.1),  # LeakyReLU for better gradient flow
    Dropout(0.25),  # Fine-tuned dropout rate

    Dense(128, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    Dropout(0.2),

    Dense(64, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),
    Dropout(0.15),

    Dense(32, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    LeakyReLU(alpha=0.1),

    Dense(1, activation='linear')  # Linear activation for regression
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mse'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_mse', patience=30, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_mse', factor=0.5, patience=15, min_lr=0.00001)

# Fit the model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=500,  # Increased epochs for better learning
    batch_size=64,  # other batch sizes could also be tried
    callbacks=[early_stopping, reduce_lr],
    verbose=2
)

# Save the trained model
model.save(model_path)  # Save entire model to HDF5 format
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.
WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.
Epoch 1/500
2024-12-09 08:56:55.639082: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
6250/6250 - 13s - loss: 1.4434 - mse: 1.0142 - val_loss: 0.9616 - val_mse: 0.7804 - lr: 0.0010 - 13s/epoch - 2ms/step
Epoch 2/500
6250/6250 - 11s - loss: 0.9546 - mse: 0.8024 - val_loss: 0.8940 - val_mse: 0.7603 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 3/500
6250/6250 - 11s - loss: 0.9060 - mse: 0.7898 - val_loss: 0.8664 - val_mse: 0.7658 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 4/500
6250/6250 - 10s - loss: 0.8753 - mse: 0.7846 - val_loss: 0.8307 - val_mse: 0.7471 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 5/500
6250/6250 - 11s - loss: 0.8586 - mse: 0.7817 - val_loss: 0.8162 - val_mse: 0.7439 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 6/500
6250/6250 - 11s - loss: 0.8497 - mse: 0.7788 - val_loss: 0.8134 - val_mse: 0.7445 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 7/500
6250/6250 - 11s - loss: 0.8447 - mse: 0.7768 - val_loss: 0.8168 - val_mse: 0.7496 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 8/500
6250/6250 - 10s - loss: 0.8423 - mse: 0.7768 - val_loss: 0.8072 - val_mse: 0.7430 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 9/500
6250/6250 - 10s - loss: 0.8396 - mse: 0.7751 - val_loss: 0.8064 - val_mse: 0.7428 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 10/500
6250/6250 - 10s - loss: 0.8385 - mse: 0.7744 - val_loss: 0.8106 - val_mse: 0.7460 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 11/500
6250/6250 - 10s - loss: 0.8365 - mse: 0.7734 - val_loss: 0.8068 - val_mse: 0.7446 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 12/500
6250/6250 - 10s - loss: 0.8368 - mse: 0.7740 - val_loss: 0.8031 - val_mse: 0.7392 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 13/500
6250/6250 - 10s - loss: 0.8361 - mse: 0.7728 - val_loss: 0.8086 - val_mse: 0.7453 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 14/500
6250/6250 - 10s - loss: 0.8360 - mse: 0.7728 - val_loss: 0.8009 - val_mse: 0.7381 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 15/500
6250/6250 - 10s - loss: 0.8355 - mse: 0.7722 - val_loss: 0.8044 - val_mse: 0.7417 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 16/500
6250/6250 - 10s - loss: 0.8361 - mse: 0.7736 - val_loss: 0.8105 - val_mse: 0.7470 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 17/500
6250/6250 - 10s - loss: 0.8358 - mse: 0.7733 - val_loss: 0.8045 - val_mse: 0.7416 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 18/500
6250/6250 - 10s - loss: 0.8340 - mse: 0.7721 - val_loss: 0.7989 - val_mse: 0.7378 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 19/500
6250/6250 - 10s - loss: 0.8353 - mse: 0.7730 - val_loss: 0.8037 - val_mse: 0.7407 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 20/500
6250/6250 - 9s - loss: 0.8351 - mse: 0.7723 - val_loss: 0.8055 - val_mse: 0.7429 - lr: 0.0010 - 9s/epoch - 2ms/step
Epoch 21/500
6250/6250 - 10s - loss: 0.8355 - mse: 0.7726 - val_loss: 0.8077 - val_mse: 0.7439 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 22/500
6250/6250 - 10s - loss: 0.8346 - mse: 0.7716 - val_loss: 0.8124 - val_mse: 0.7499 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 23/500
6250/6250 - 10s - loss: 0.8358 - mse: 0.7732 - val_loss: 0.8053 - val_mse: 0.7420 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 24/500
6250/6250 - 10s - loss: 0.8359 - mse: 0.7734 - val_loss: 0.8060 - val_mse: 0.7420 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 25/500
6250/6250 - 11s - loss: 0.8344 - mse: 0.7727 - val_loss: 0.8039 - val_mse: 0.7429 - lr: 0.0010 - 11s/epoch - 2ms/step
Epoch 26/500
6250/6250 - 10s - loss: 0.8333 - mse: 0.7714 - val_loss: 0.8129 - val_mse: 0.7492 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 27/500
6250/6250 - 10s - loss: 0.8345 - mse: 0.7725 - val_loss: 0.8037 - val_mse: 0.7426 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 28/500
6250/6250 - 10s - loss: 0.8335 - mse: 0.7723 - val_loss: 0.7996 - val_mse: 0.7403 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 29/500
6250/6250 - 10s - loss: 0.8356 - mse: 0.7728 - val_loss: 0.8026 - val_mse: 0.7411 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 30/500
6250/6250 - 10s - loss: 0.8340 - mse: 0.7723 - val_loss: 0.8043 - val_mse: 0.7440 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 31/500
6250/6250 - 10s - loss: 0.8331 - mse: 0.7721 - val_loss: 0.8125 - val_mse: 0.7526 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 32/500
6250/6250 - 10s - loss: 0.8329 - mse: 0.7717 - val_loss: 0.8090 - val_mse: 0.7482 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 33/500
6250/6250 - 10s - loss: 0.8340 - mse: 0.7723 - val_loss: 0.8097 - val_mse: 0.7494 - lr: 0.0010 - 10s/epoch - 2ms/step
Epoch 34/500
6250/6250 - 10s - loss: 0.8036 - mse: 0.7554 - val_loss: 0.7724 - val_mse: 0.7301 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 35/500
6250/6250 - 10s - loss: 0.7945 - mse: 0.7531 - val_loss: 0.7705 - val_mse: 0.7300 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 36/500
6250/6250 - 10s - loss: 0.7923 - mse: 0.7525 - val_loss: 0.7651 - val_mse: 0.7263 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 37/500
6250/6250 - 10s - loss: 0.7926 - mse: 0.7531 - val_loss: 0.7703 - val_mse: 0.7307 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 38/500
6250/6250 - 10s - loss: 0.7916 - mse: 0.7518 - val_loss: 0.7666 - val_mse: 0.7269 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 39/500
6250/6250 - 10s - loss: 0.7911 - mse: 0.7510 - val_loss: 0.7657 - val_mse: 0.7261 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 40/500
6250/6250 - 10s - loss: 0.7914 - mse: 0.7514 - val_loss: 0.7726 - val_mse: 0.7328 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 41/500
6250/6250 - 10s - loss: 0.7920 - mse: 0.7519 - val_loss: 0.7675 - val_mse: 0.7275 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 42/500
6250/6250 - 10s - loss: 0.7925 - mse: 0.7521 - val_loss: 0.7730 - val_mse: 0.7323 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 43/500
6250/6250 - 10s - loss: 0.7916 - mse: 0.7512 - val_loss: 0.7691 - val_mse: 0.7291 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 44/500
6250/6250 - 10s - loss: 0.7902 - mse: 0.7496 - val_loss: 0.7709 - val_mse: 0.7301 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 45/500
6250/6250 - 10s - loss: 0.7914 - mse: 0.7507 - val_loss: 0.7689 - val_mse: 0.7285 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 46/500
6250/6250 - 10s - loss: 0.7913 - mse: 0.7508 - val_loss: 0.7675 - val_mse: 0.7271 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 47/500
6250/6250 - 10s - loss: 0.7912 - mse: 0.7505 - val_loss: 0.7687 - val_mse: 0.7280 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 48/500
6250/6250 - 10s - loss: 0.7913 - mse: 0.7503 - val_loss: 0.7668 - val_mse: 0.7254 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 49/500
6250/6250 - 10s - loss: 0.7912 - mse: 0.7502 - val_loss: 0.7660 - val_mse: 0.7249 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 50/500
6250/6250 - 10s - loss: 0.7914 - mse: 0.7504 - val_loss: 0.7674 - val_mse: 0.7268 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 51/500
6250/6250 - 10s - loss: 0.7910 - mse: 0.7502 - val_loss: 0.7734 - val_mse: 0.7320 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 52/500
6250/6250 - 10s - loss: 0.7901 - mse: 0.7494 - val_loss: 0.7654 - val_mse: 0.7249 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 53/500
6250/6250 - 10s - loss: 0.7910 - mse: 0.7501 - val_loss: 0.7684 - val_mse: 0.7280 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 54/500
6250/6250 - 10s - loss: 0.7911 - mse: 0.7505 - val_loss: 0.7638 - val_mse: 0.7235 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 55/500
6250/6250 - 10s - loss: 0.7911 - mse: 0.7503 - val_loss: 0.7698 - val_mse: 0.7289 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 56/500
6250/6250 - 10s - loss: 0.7913 - mse: 0.7503 - val_loss: 0.7682 - val_mse: 0.7265 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 57/500
6250/6250 - 10s - loss: 0.7907 - mse: 0.7494 - val_loss: 0.7686 - val_mse: 0.7270 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 58/500
6250/6250 - 10s - loss: 0.7918 - mse: 0.7509 - val_loss: 0.7682 - val_mse: 0.7275 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 59/500
6250/6250 - 10s - loss: 0.7911 - mse: 0.7498 - val_loss: 0.7729 - val_mse: 0.7315 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 60/500
6250/6250 - 10s - loss: 0.7909 - mse: 0.7500 - val_loss: 0.7660 - val_mse: 0.7257 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 61/500
6250/6250 - 10s - loss: 0.7920 - mse: 0.7512 - val_loss: 0.7668 - val_mse: 0.7254 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 62/500
6250/6250 - 10s - loss: 0.7901 - mse: 0.7490 - val_loss: 0.7713 - val_mse: 0.7299 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 63/500
6250/6250 - 10s - loss: 0.7910 - mse: 0.7497 - val_loss: 0.7689 - val_mse: 0.7271 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 64/500
6250/6250 - 10s - loss: 0.7927 - mse: 0.7514 - val_loss: 0.7652 - val_mse: 0.7238 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 65/500
6250/6250 - 10s - loss: 0.7920 - mse: 0.7504 - val_loss: 0.7692 - val_mse: 0.7285 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 66/500
6250/6250 - 10s - loss: 0.7907 - mse: 0.7499 - val_loss: 0.7678 - val_mse: 0.7265 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 67/500
6250/6250 - 10s - loss: 0.7908 - mse: 0.7495 - val_loss: 0.7686 - val_mse: 0.7264 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 68/500
6250/6250 - 10s - loss: 0.7910 - mse: 0.7497 - val_loss: 0.7676 - val_mse: 0.7263 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 69/500
6250/6250 - 10s - loss: 0.7902 - mse: 0.7492 - val_loss: 0.7665 - val_mse: 0.7258 - lr: 5.0000e-04 - 10s/epoch - 2ms/step
Epoch 70/500
6250/6250 - 10s - loss: 0.7719 - mse: 0.7358 - val_loss: 0.7513 - val_mse: 0.7179 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 71/500
6250/6250 - 11s - loss: 0.7635 - mse: 0.7318 - val_loss: 0.7458 - val_mse: 0.7155 - lr: 2.5000e-04 - 11s/epoch - 2ms/step
Epoch 72/500
6250/6250 - 11s - loss: 0.7615 - mse: 0.7319 - val_loss: 0.7464 - val_mse: 0.7176 - lr: 2.5000e-04 - 11s/epoch - 2ms/step
Epoch 73/500
6250/6250 - 10s - loss: 0.7597 - mse: 0.7310 - val_loss: 0.7492 - val_mse: 0.7208 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 74/500
6250/6250 - 11s - loss: 0.7587 - mse: 0.7304 - val_loss: 0.7459 - val_mse: 0.7179 - lr: 2.5000e-04 - 11s/epoch - 2ms/step
Epoch 75/500
6250/6250 - 11s - loss: 0.7580 - mse: 0.7300 - val_loss: 0.7417 - val_mse: 0.7138 - lr: 2.5000e-04 - 11s/epoch - 2ms/step
Epoch 76/500
6250/6250 - 10s - loss: 0.7587 - mse: 0.7308 - val_loss: 0.7438 - val_mse: 0.7161 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 77/500
6250/6250 - 10s - loss: 0.7588 - mse: 0.7308 - val_loss: 0.7443 - val_mse: 0.7161 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 78/500
6250/6250 - 10s - loss: 0.7572 - mse: 0.7293 - val_loss: 0.7432 - val_mse: 0.7153 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 79/500
6250/6250 - 10s - loss: 0.7573 - mse: 0.7292 - val_loss: 0.7444 - val_mse: 0.7163 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 80/500
6250/6250 - 10s - loss: 0.7578 - mse: 0.7296 - val_loss: 0.7442 - val_mse: 0.7160 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 81/500
6250/6250 - 10s - loss: 0.7572 - mse: 0.7291 - val_loss: 0.7453 - val_mse: 0.7173 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 82/500
6250/6250 - 10s - loss: 0.7574 - mse: 0.7291 - val_loss: 0.7440 - val_mse: 0.7155 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 83/500
6250/6250 - 10s - loss: 0.7568 - mse: 0.7283 - val_loss: 0.7453 - val_mse: 0.7169 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 84/500
6250/6250 - 10s - loss: 0.7568 - mse: 0.7284 - val_loss: 0.7460 - val_mse: 0.7175 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 85/500
6250/6250 - 10s - loss: 0.7576 - mse: 0.7291 - val_loss: 0.7456 - val_mse: 0.7171 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 86/500
6250/6250 - 10s - loss: 0.7567 - mse: 0.7281 - val_loss: 0.7439 - val_mse: 0.7153 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 87/500
6250/6250 - 10s - loss: 0.7570 - mse: 0.7284 - val_loss: 0.7460 - val_mse: 0.7174 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 88/500
6250/6250 - 10s - loss: 0.7580 - mse: 0.7292 - val_loss: 0.7441 - val_mse: 0.7154 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 89/500
6250/6250 - 10s - loss: 0.7570 - mse: 0.7282 - val_loss: 0.7450 - val_mse: 0.7164 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 90/500
6250/6250 - 10s - loss: 0.7569 - mse: 0.7281 - val_loss: 0.7460 - val_mse: 0.7168 - lr: 2.5000e-04 - 10s/epoch - 2ms/step
Epoch 91/500
6250/6250 - 10s - loss: 0.7446 - mse: 0.7174 - val_loss: 0.7374 - val_mse: 0.7113 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 92/500
6250/6250 - 10s - loss: 0.7396 - mse: 0.7141 - val_loss: 0.7358 - val_mse: 0.7111 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 93/500
6250/6250 - 10s - loss: 0.7368 - mse: 0.7127 - val_loss: 0.7372 - val_mse: 0.7135 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 94/500
6250/6250 - 10s - loss: 0.7350 - mse: 0.7117 - val_loss: 0.7357 - val_mse: 0.7127 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 95/500
6250/6250 - 10s - loss: 0.7340 - mse: 0.7114 - val_loss: 0.7337 - val_mse: 0.7114 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 96/500
6250/6250 - 10s - loss: 0.7339 - mse: 0.7120 - val_loss: 0.7348 - val_mse: 0.7131 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 97/500
6250/6250 - 10s - loss: 0.7317 - mse: 0.7102 - val_loss: 0.7347 - val_mse: 0.7133 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 98/500
6250/6250 - 10s - loss: 0.7323 - mse: 0.7111 - val_loss: 0.7345 - val_mse: 0.7134 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 99/500
6250/6250 - 10s - loss: 0.7327 - mse: 0.7118 - val_loss: 0.7325 - val_mse: 0.7117 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 100/500
6250/6250 - 10s - loss: 0.7325 - mse: 0.7117 - val_loss: 0.7319 - val_mse: 0.7112 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 101/500
6250/6250 - 10s - loss: 0.7312 - mse: 0.7105 - val_loss: 0.7322 - val_mse: 0.7116 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 102/500
6250/6250 - 10s - loss: 0.7317 - mse: 0.7112 - val_loss: 0.7337 - val_mse: 0.7133 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 103/500
6250/6250 - 10s - loss: 0.7312 - mse: 0.7107 - val_loss: 0.7330 - val_mse: 0.7125 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 104/500
6250/6250 - 10s - loss: 0.7303 - mse: 0.7098 - val_loss: 0.7327 - val_mse: 0.7122 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 105/500
6250/6250 - 10s - loss: 0.7302 - mse: 0.7097 - val_loss: 0.7338 - val_mse: 0.7132 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 106/500
6250/6250 - 10s - loss: 0.7302 - mse: 0.7097 - val_loss: 0.7331 - val_mse: 0.7128 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 107/500
6250/6250 - 10s - loss: 0.7302 - mse: 0.7098 - val_loss: 0.7342 - val_mse: 0.7138 - lr: 1.2500e-04 - 10s/epoch - 2ms/step
Epoch 108/500
6250/6250 - 10s - loss: 0.7216 - mse: 0.7017 - val_loss: 0.7319 - val_mse: 0.7123 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 109/500
6250/6250 - 10s - loss: 0.7185 - mse: 0.6992 - val_loss: 0.7304 - val_mse: 0.7114 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 110/500
6250/6250 - 10s - loss: 0.7176 - mse: 0.6988 - val_loss: 0.7295 - val_mse: 0.7109 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 111/500
6250/6250 - 10s - loss: 0.7162 - mse: 0.6978 - val_loss: 0.7301 - val_mse: 0.7119 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 112/500
6250/6250 - 10s - loss: 0.7157 - mse: 0.6977 - val_loss: 0.7294 - val_mse: 0.7116 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 113/500
6250/6250 - 10s - loss: 0.7145 - mse: 0.6969 - val_loss: 0.7296 - val_mse: 0.7121 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 114/500
6250/6250 - 10s - loss: 0.7139 - mse: 0.6966 - val_loss: 0.7287 - val_mse: 0.7115 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 115/500
6250/6250 - 10s - loss: 0.7150 - mse: 0.6979 - val_loss: 0.7300 - val_mse: 0.7129 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 116/500
6250/6250 - 10s - loss: 0.7140 - mse: 0.6971 - val_loss: 0.7295 - val_mse: 0.7127 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 117/500
6250/6250 - 10s - loss: 0.7129 - mse: 0.6962 - val_loss: 0.7296 - val_mse: 0.7130 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 118/500
6250/6250 - 10s - loss: 0.7128 - mse: 0.6963 - val_loss: 0.7287 - val_mse: 0.7123 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 119/500
6250/6250 - 10s - loss: 0.7124 - mse: 0.6961 - val_loss: 0.7284 - val_mse: 0.7122 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 120/500
6250/6250 - 10s - loss: 0.7128 - mse: 0.6967 - val_loss: 0.7275 - val_mse: 0.7115 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 121/500
6250/6250 - 10s - loss: 0.7128 - mse: 0.6968 - val_loss: 0.7283 - val_mse: 0.7124 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 122/500
6250/6250 - 10s - loss: 0.7121 - mse: 0.6962 - val_loss: 0.7290 - val_mse: 0.7133 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 123/500
6250/6250 - 10s - loss: 0.7126 - mse: 0.6968 - val_loss: 0.7282 - val_mse: 0.7125 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 124/500
6250/6250 - 10s - loss: 0.7113 - mse: 0.6957 - val_loss: 0.7291 - val_mse: 0.7135 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 125/500
6250/6250 - 10s - loss: 0.7117 - mse: 0.6962 - val_loss: 0.7277 - val_mse: 0.7123 - lr: 6.2500e-05 - 10s/epoch - 2ms/step
Epoch 126/500
6250/6250 - 10s - loss: 0.7057 - mse: 0.6905 - val_loss: 0.7272 - val_mse: 0.7120 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 127/500
6250/6250 - 9s - loss: 0.7043 - mse: 0.6893 - val_loss: 0.7274 - val_mse: 0.7125 - lr: 3.1250e-05 - 9s/epoch - 2ms/step
Epoch 128/500
6250/6250 - 9s - loss: 0.7032 - mse: 0.6884 - val_loss: 0.7266 - val_mse: 0.7118 - lr: 3.1250e-05 - 9s/epoch - 2ms/step
Epoch 129/500
6250/6250 - 10s - loss: 0.7023 - mse: 0.6876 - val_loss: 0.7268 - val_mse: 0.7123 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 130/500
6250/6250 - 9s - loss: 0.7019 - mse: 0.6874 - val_loss: 0.7270 - val_mse: 0.7126 - lr: 3.1250e-05 - 9s/epoch - 1ms/step
Epoch 131/500
6250/6250 - 9s - loss: 0.7010 - mse: 0.6867 - val_loss: 0.7280 - val_mse: 0.7137 - lr: 3.1250e-05 - 9s/epoch - 1ms/step
Epoch 132/500
6250/6250 - 10s - loss: 0.7015 - mse: 0.6873 - val_loss: 0.7267 - val_mse: 0.7126 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 133/500
6250/6250 - 9s - loss: 0.7002 - mse: 0.6862 - val_loss: 0.7269 - val_mse: 0.7130 - lr: 3.1250e-05 - 9s/epoch - 2ms/step
Epoch 134/500
6250/6250 - 10s - loss: 0.7009 - mse: 0.6871 - val_loss: 0.7275 - val_mse: 0.7137 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 135/500
6250/6250 - 10s - loss: 0.7006 - mse: 0.6869 - val_loss: 0.7267 - val_mse: 0.7131 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 136/500
6250/6250 - 10s - loss: 0.7007 - mse: 0.6871 - val_loss: 0.7269 - val_mse: 0.7134 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 137/500
6250/6250 - 9s - loss: 0.6992 - mse: 0.6857 - val_loss: 0.7272 - val_mse: 0.7139 - lr: 3.1250e-05 - 9s/epoch - 2ms/step
Epoch 138/500
6250/6250 - 10s - loss: 0.6985 - mse: 0.6852 - val_loss: 0.7278 - val_mse: 0.7145 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 139/500
6250/6250 - 10s - loss: 0.6992 - mse: 0.6860 - val_loss: 0.7271 - val_mse: 0.7140 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
Epoch 140/500
6250/6250 - 10s - loss: 0.6995 - mse: 0.6864 - val_loss: 0.7273 - val_mse: 0.7143 - lr: 3.1250e-05 - 10s/epoch - 2ms/step
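Training halts at epoch 140, consistent with the EarlyStopping patience of 30 epochs after the best validation MSE around epoch 110. Since model.fit returned a history object, an optional follow-up (not in the original notebook) is to plot the learning curves to confirm the plateau visible in the log; the 'mse'/'val_mse' keys follow from the metrics compiled above:

# Optional: visualize the learning curves captured by model.fit above.
plt.figure(figsize=(8, 5))
plt.plot(history.history['mse'], label='Training MSE')
plt.plot(history.history['val_mse'], label='Validation MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('Training vs. Validation MSE per Epoch')
plt.legend()
plt.show()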
In [9]:
model = load_model(model_path) 
In [10]:
y_pred = model.predict(X_train)

print("Predictions on the test set:")
print(y_pred)

# Calculate Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_pred)

# Calculate R-squared
r2_train = r2_score(y_train, y_pred)

# Print the results
print(f"Training MSE: {mse_train}")
print(f"Training R-squared: {r2_train}")

y_pred_test = model.predict(X_test)
print(y_pred_test)
15625/15625 [==============================] - 7s 432us/step
Predictions on the training set:
[[3.1388288]
 [2.7456193]
 [4.656323 ]
 ...
 [3.80933  ]
 [2.9484968]
 [2.980651 ]]
Training MSE: 0.6588238189223193
Training R-squared: 0.4645265585273568
3125/3125 [==============================] - 1s 433us/step
[[4.6070895]
 [4.155876 ]
 [4.70691  ]
 ...
 [3.6154668]
 [3.5599217]
 [4.668838 ]]
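Because the output layer is linear and unbounded, some predictions can drift outside the valid 1-5 rating range. An optional post-processing step (not applied in the submission cells below) would be to clip the raw outputs before export:

# Optional: constrain raw regression outputs to the valid rating range.
y_pred_test_clipped = np.clip(y_pred_test, 1.0, 5.0)
print(y_pred_test_clipped.min(), y_pred_test_clipped.max())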
In [11]:
# Step 1: Define your information
student_id = '20952031' 
anonymized_name = '808' 
prediction_accuracy = r2_train  
algorithm_name = 'Neural Network Model' 

data = [
    [student_id],
    [anonymized_name],
    [prediction_accuracy],
    [algorithm_name]
] + [pred for pred in y_pred_test]


df = pd.DataFrame(data)

df.to_csv('final_perdictions_learn1.csv', header=False, index=False)

print("CSV file created successfully.")
CSV file created successfully.
In [12]:
print(y_pred_test)
[[4.6070895]
 [4.155876 ]
 [4.70691  ]
 ...
 [3.6154668]
 [3.5599217]
 [4.668838 ]]
In [13]:
# Create the Kaggle-style DataFrame
y_data_kaggle = y_pred_test.flatten()  # flatten the (n, 1) prediction array into a 1-D sequence

data = {
    "ID_num": range(1, len(y_data_kaggle) + 1),
    "prediction": y_data_kaggle
}

df = pd.DataFrame(data)

# Save to CSV
df.to_csv('kaggle_predictions1.csv', index=False)

print("CSV file created successfully in Kaggle format.")
CSV file created successfully in Kaggle format.
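A quick sanity check before uploading is to read the submission file back and confirm its shape and columns (a small illustrative snippet; the expected row count follows from the 100,000-row test set):

# Verify the submission file has one prediction per test row.
submission_check = pd.read_csv('kaggle_predictions1.csv')
print(submission_check.shape)              # expected: (100000, 2)
print(submission_check.columns.tolist())   # expected: ['ID_num', 'prediction']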
In [14]:
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 256)               256512    
                                                                 
 batch_normalization (BatchN  (None, 256)              1024      
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 256)               0         
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 batch_normalization_1 (Batc  (None, 128)              512       
 hNormalization)                                                 
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 128)               0         
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_2 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 64)                0         
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 batch_normalization_3 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 301,697
Trainable params: 300,737
Non-trainable params: 960
_________________________________________________________________
In [46]:
from keras.utils import plot_model

# Visualize model architecture
plot_model(model, to_file='model_structure.png', show_shapes=True, show_layer_names=True)
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
In [47]:
# Display the model structure image
plt.figure(figsize=(10, 35))
img = plt.imread('model_structure.png')
plt.imshow(img)
plt.axis('off')
plt.show()
In [48]:
from wordcloud import WordCloud

for star_rating in range(1, 6):
    # Combine all 'pros' text for the current star rating
    review_text = ' '.join(df_train[df_train['rating'] == star_rating]['pros'].tolist())
    
    # Create a word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(review_text)
    
    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {star_rating}-Star Reviews')
    plt.axis('off')  # Hide axes for better visualization
    plt.show()
In [49]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Encode categorical features
label_encoder = LabelEncoder()
df_train['firm_encoded'] = label_encoder.fit_transform(df_train['firm'].fillna(''))
df_train['job_title_encoded'] = label_encoder.fit_transform(df_train['job_title'].fillna(''))

# Define features and target variable
feature_columns = ['year_review', 'firm_encoded', 'job_title_encoded']
target_column = 'rating'  # Predicting the 'rating'

# Prepare data for training and testing
X = df_train[feature_columns]
y = df_train[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
random_forest.fit(X_train, y_train)

# Extract feature importance scores
feature_importances = random_forest.feature_importances_

# Visualize feature importance
plt.figure(figsize=(8, 6))
plt.barh(feature_columns, feature_importances, color='skyblue')
plt.title("Feature Importance for Rating Prediction")
plt.xlabel("Importance Score")
plt.ylabel("Feature Names")
plt.tight_layout()  # Adjust layout to fit title and labels
plt.show()
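To quantify how much signal these three structured features carry on their own, the 30% hold-out split created above can be scored with the same regression metrics used earlier (a short sketch reusing the already-imported mean_squared_error and r2_score):

# Evaluate the Random Forest on the hold-out split from train_test_split above.
rf_test_pred = random_forest.predict(X_test)
print('Random Forest hold-out MSE:', mean_squared_error(y_test, rf_test_pred))
print('Random Forest hold-out R-squared:', r2_score(y_test, rf_test_pred))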
In [50]:
correlation_matrix = X.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))  # Adjust figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix for Features')
plt.show()
In [51]:
# Visualize the distribution of each feature in the training and testing datasets
for feature_name in feature_columns:
    plt.figure(figsize=(8, 6))
    
    # Plot the distribution of the feature in the training data
    plt.hist(
        X_train[feature_name], 
        bins=30, 
        alpha=0.7, 
        color='blue', 
        label='Training Data', 
        density=True
    )
    
    # Plot the distribution of the feature in the test data
    plt.hist(
        X_test[feature_name], 
        bins=30, 
        alpha=0.7, 
        color='red', 
        label='Testing Data', 
        density=True
    )
    
    # Add title and labels
    plt.title(f"Distribution of '{feature_name}' in Training vs Testing Data")
    plt.xlabel(f"{feature_name}")
    plt.ylabel("Density")
    plt.legend()  # Show the legend to distinguish between Train and Test data
    plt.tight_layout()  # Ensure the layout fits nicely
    plt.show()
In [53]:
# Predict ratings for both training and test sets
train_predictions = random_forest.predict(X_train)
test_predictions = random_forest.predict(X_test)

# Plot the distribution of actual vs. predicted ratings for training and test datasets
plt.figure(figsize=(12, 6))

# Plot for the training data
plt.subplot(1, 2, 1)
plt.hist(y_train, bins=30, alpha=0.7, color='blue', label='Actual (Training)', density=True)
plt.hist(train_predictions, bins=30, alpha=0.7, color='red', label='Predicted (Training)', density=True)
plt.title("Actual vs. Predicted Ratings: Training Data")
plt.xlabel("Rating")
plt.ylabel("Density")
plt.legend()

# Plot for the test data
plt.subplot(1, 2, 2)
plt.hist(y_test, bins=30, alpha=0.7, color='blue', label='Actual (Testing)', density=True)
plt.hist(test_predictions, bins=30, alpha=0.7, color='red', label='Predicted (Testing)', density=True)
plt.title("Actual vs. Predicted Ratings: Testing Data")
plt.xlabel("Rating")
plt.ylabel("Density")
plt.legend()

# Adjust layout for better visualization
plt.tight_layout()
plt.show()
In [54]:
# Calculate residuals (errors) for training and testing datasets
train_residuals = y_train - train_predictions
test_residuals = y_test - test_predictions

# Visualize the distribution of residuals
plt.figure(figsize=(12, 6))

# Residual plot for training data
plt.subplot(1, 2, 1)
plt.scatter(y_train, train_residuals, alpha=0.5, color='blue')
plt.axhline(0, color='black', linestyle='--', linewidth=1)  # Reference line at 0
plt.title("Actual Ratings vs Residuals: Training Data")
plt.xlabel("Actual Rating")
plt.ylabel("Residual (Error)")

# Residual plot for testing data
plt.subplot(1, 2, 2)
plt.scatter(y_test, test_residuals, alpha=0.5, color='red')
plt.axhline(0, color='black', linestyle='--', linewidth=1)  # Reference line at 0
plt.title("Actual Ratings vs Residuals: Testing Data")
plt.xlabel("Actual Rating")
plt.ylabel("Residual (Error)")

# Adjust layout and show the plots
plt.tight_layout()
plt.show()
In [55]:
def get_top_phrases_by_rating(df, rating_threshold):
    # Note: this keeps reviews with rating >= rating_threshold, so the "low rating"
    # call below (threshold 2) also includes every higher-rated review, which is why
    # the two word lists printed further down overlap so heavily.
    subset = df[df['rating'] >= rating_threshold]
    combined_text = ' '.join(subset['pros'] + ' ' + subset['cons'] + ' ' + subset['headline'])
    word_freq = pd.Series(combined_text.split()).value_counts().head(10)
    return word_freq

high_rating_phrases = get_top_phrases_by_rating(df_train, 4)
low_rating_phrases = get_top_phrases_by_rating(df_train, 2)

print("Top phrases in high-rated firms:")
print(high_rating_phrases)
print("\nTop phrases in low-rated firms:")
print(low_rating_phrases)
Top phrases in high-rated firms:
work           248368
good           202102
great          197599
company        124557
people          84598
place           76033
opportunity     65366
working         59299
hour            55052
lot             54680
Name: count, dtype: int64

Top phrases in low-rated firms:
work           358793
good           309259
great          250627
company        173852
people         135448
place           98262
opportunity     91781
management      88760
hour            88193
working         87336
Name: count, dtype: int64
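Because both calls above filter with a >= threshold, the two lists overlap heavily. A hedged alternative that isolates genuinely low-rated reviews (rating <= 2) and contrasts them with high-rated ones (rating >= 4) could look like this:

# Compare top words for low-rated (<= 2) vs. high-rated (>= 4) reviews.
def top_words(subset, n=10):
    combined = ' '.join(subset['pros'] + ' ' + subset['cons'] + ' ' + subset['headline'])
    return pd.Series(combined.split()).value_counts().head(n)

print(top_words(df_train[df_train['rating'] >= 4]))
print(top_words(df_train[df_train['rating'] <= 2]))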
In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Reload the raw training data (the df_train used earlier was already text-cleaned)
df = pd.read_csv(large_train)

# Categorize reviews by rating (low: 1-2, medium: 3-4, high: 5)
df['rating_group'] = pd.cut(df['rating'], bins=[0, 2, 4, 5], labels=['Low', 'Medium', 'High'])

# Function to extract top words/themes
def extract_themes(text_column, top_n=10):
    vectorizer = CountVectorizer(stop_words='english')
    word_counts = vectorizer.fit_transform(text_column.dropna())
    word_sum = word_counts.sum(axis=0)
    words_freq = [(word, word_sum[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)[:top_n]
    return words_freq

# Analyze themes for pros and cons by rating group
themes = {}
for group in df['rating_group'].unique():
    group_data = df[df['rating_group'] == group]
    pros_themes = extract_themes(group_data['pros'], top_n=10)
    cons_themes = extract_themes(group_data['cons'], top_n=10)
    themes[group] = {'pros': pros_themes, 'cons': cons_themes}

# Display results in a table
for group, data in themes.items():
    print(f"\n=== {group} Rated Firms ===")
    print("Top Pros:", data['pros'])
    print("Top Cons:", data['cons'])

# Optional: Create word clouds for visualization
def plot_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(text))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.show()

for group in df['rating_group'].unique():
    group_data = df[df['rating_group'] == group]
    plot_wordcloud(group_data['pros'].dropna(), f"{group} Rated Firms - Pros")
    plot_wordcloud(group_data['cons'].dropna(), f"{group} Rated Firms - Cons")
=== Medium Rated Firms ===
Top Pros: [('good', 135574), ('work', 116435), ('great', 86490), ('people', 55525), ('benefits', 39715), ('company', 35581), ('environment', 31546), ('opportunities', 29698), ('working', 28386), ('life', 27891)]
Top Cons: [('work', 73507), ('management', 37321), ('hours', 36969), ('long', 27701), ('pay', 26397), ('company', 24463), ('life', 22969), ('people', 21708), ('time', 20808), ('balance', 20277)]

=== Low Rated Firms ===
Top Pros: [('good', 24406), ('work', 17873), ('great', 11024), ('people', 10598), ('benefits', 7689), ('company', 6677), ('nice', 5289), ('pay', 5244), ('working', 5111), ('job', 4347)]
Top Cons: [('work', 27668), ('management', 26298), ('people', 15191), ('company', 12934), ('pay', 10983), ('staff', 10689), ('managers', 10510), ('employees', 10105), ('time', 9444), ('hours', 9041)]

=== High Rated Firms ===
Top Pros: [('work', 62408), ('great', 58361), ('good', 48450), ('people', 28143), ('company', 24747), ('culture', 19808), ('benefits', 18672), ('environment', 18437), ('opportunities', 15378), ('working', 14786)]
Top Cons: [('work', 26527), ('hours', 15501), ('long', 12257), ('cons', 12144), ('company', 11643), ('working', 9138), ('time', 8740), ('life', 7328), ('think', 7211), ('good', 6785)]